Should be a total of 121 files named: Cluster{a}.{b}.RA{c}.csv where:
# Import cluster data from directory with cluster csv files
# for example: Cluster1.1.RA1.csv, Cluster1.2.RA1.csv, ...
cluster.files <- grep("Cluster", list.files("../../data/", full.names = TRUE), value = TRUE)
d <- do.call(rbind, strsplit(do.call(rbind, strsplit(cluster.files, "/"))[, 5], "\\."))
d[, 1] <- substr(d[, 1], start = 8, stop = 8)
d <- data.frame(cluster = as.numeric(d[, 1]),
section = as.numeric(d[, 2]),
patient = d[, 3],
id = paste0(d[, 3], "x", d[, 2]))
d$path <- cluster.files
# Display data for html report
DT::datatable(head(d))
Should be a total of 27 files named: RA{a}Raw.exp{b}.csv where:
# Import expression matrices
expr.files <- grep("Raw.exp", list.files("../../data", full.names = TRUE), value = TRUE)
dexpr <- do.call(rbind, strsplit(do.call(rbind, strsplit(expr.files, "/"))[, 4], "_"))
dexpr <- data.frame(patient = dexpr[, 1], section = as.numeric(substr(dexpr[, 3], 1, 1)))
dexpr$id <- paste0(dexpr$patient, "x", dexpr$section)
dexpr$path <- expr.files
conv <- setNames(dexpr$path, nm = dexpr$id)
# Merge meta data
d$expr_path <- conv[d$id]
d <- na.omit(d)
# Display data for html report
DT::datatable(head(d))
# Iterate through each patient
res <- lapply(unique(d$patient), function(patient_id) {
print(patient_id)
d.subset.tmp <- subset(d, patient %in% patient_id)
# Iterate through each section
tst <- lapply(unique(d.subset.tmp$section), function(section_id) {
print(section_id)
# Subset meta data to include correct patient and section number
d.subset <- subset(d, patient %in% patient_id & section %in% section_id)
# Iterate through each cluster and collect spatial coordinates
coords <- do.call(rbind, lapply(1:nrow(d.subset), function(i) {
df <- read.table(d.subset$path[i], sep = ",", header = TRUE)
df <- df[, 1:3]
df$cluster <- d.subset$cluster[i]
return(df)
}))
coords$ID <- paste0(coords$x, "_", coords$y)
coords <- coords[!duplicated(coords$ID), ]
rownames(coords) <- paste0(coords$x, "_", coords$y)
#exprMat <- read.table(unique(d.subset$expr_path), sep = ",", header = TRUE)
# Read expression data
exprMat <- read.table(unique(d.subset$expr_path), sep = ",", header = TRUE, row.names = 1)
colnames(exprMat) <- gsub(pattern = "^X", replacement = "", x = colnames(exprMat))
# Define intersecting spots
spots.keep <- intersect(rownames(coords), colnames(exprMat))
# Subset expression matrix and coords
exprMat <- exprMat[, spots.keep]
coords <- coords[spots.keep, ]
combined <- cbind(coords, t(exprMat))
rownames(combined) <- paste0(combined$Image_ID, "x", combined$ID)
return(combined)
})
# Make sure that only intersecting genes are kept
intersecting.genes <- Reduce(intersect, lapply(tst, colnames))
tst <- do.call(rbind, lapply(tst, function(x) x[, intersecting.genes]))
return(tst)
})
## [1] "RA1"
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] "RA2"
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] "RA3"
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] "RA4"
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] "RA5"
## [1] 1
## [1] 2
## [1] 3
## [1] "RA6"
## [1] 1
## [1] 2
## [1] 3
## [1] 4
# Again, make sure that only intersecting genes are kept but this time
# across the merged patient datasets
intersecting.columns <- Reduce(intersect, lapply(res, colnames))
# Merge patient datasets (10879 columns and 16684 rows)
res.merged <- do.call(rbind, lapply(res, function(x) x[, intersecting.columns]))
# Display data for html report
DT::datatable(res.merged[1:5, 1:10])
Check cluster distributions across all datasets
library(ggplot2)
p <- ggplot(res.merged, aes(x, y, color = factor(cluster))) +
geom_point(size = 0.5) +
facet_wrap(~Image_ID) +
theme_void() +
labs(color = "cluster")
p
png(filename = "../all_clusters/cluster_overview.png", width = 3000, height = 3000, res = 300)
print(p)
dev.off()
## png
## 2
Create metadata for cellphonedb
library(data.table)
# One column with cell name (proxy for "spot") and one column with cluster id
metadata <- data.frame(cell = rownames(res.merged), cluster = res.merged$cluster)
# Add column for cluster 1 vs background comparison
# All clusters except 1 are treated as "background"
metadata$aggr_cluster <- ifelse(metadata$cluster == 1, "cluster_1", "background")
# Add patient id
patient <- substr(metadata$cell, 1, 3)
# Add column for seropositive vs seronegative comparison
metadata$cluster_1 <- paste0(patient, "_", metadata$cluster)
metadata$cluster_1 <- ifelse(metadata$cluster_1 %in% c("RA1_1", "RA2_1", "RA3_1"), "cluster_1_seropositive", metadata$cluster_1)
metadata$cluster_1 <- ifelse(metadata$cluster_1 %in% c("RA4_1", "RA5_1", "RA6_1"), "cluster_1_seronegative", metadata$cluster_1)
# Display data for html report
DT::datatable(head(metadata))
Subset res.merged to include expression data only and transpose for cellphonedb. Add a column named “Gene” to keep gene symbols in.
exprMat <- t(res.merged[, 6:ncol(res.merged)])
exprMat <- data.frame(Gene = rownames(exprMat), exprMat)
Export metadata and expression data for cluster 1 vs background comparison in seropositive patients; RA1, RA2 and RA3
# Export data for sections 1 to 3, cluster 1 vs all
mdata.RA1to3_cluster_1_vs_all <- setNames(metadata[patient %in% c("RA1", "RA2", "RA3"), c("cell", "aggr_cluster")], nm = c("cell", "cluster"))
write.table(x = mdata.RA1to3_cluster_1_vs_all, file = "../../cellphonedb/input/meta_RA1to3_cluster_1_vs_all.txt", quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
fwrite(x = exprMat[, c(1, which(patient %in% c("RA1", "RA2", "RA3")) + 1)], file = "../../cellphonedb/input/counts_RA1to3_cluster_1_vs_all.txt", sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
mdata.RA1to3_cluster_1_vs_all <- cbind(mdata.RA1to3_cluster_1_vs_all, res.merged[mdata.RA1to3_cluster_1_vs_all$cell, c("Image_ID", "x", "y")])
# Plot selections for sanity check
p <- ggplot(mdata.RA1to3_cluster_1_vs_all, aes(x, y, color = factor(cluster))) +
geom_point(size = 0.5) +
facet_wrap(~Image_ID) +
theme_void() +
labs(color = "cluster")
p
png(filename = "../all_clusters/cluster_1_vs_all_RA1to3_overview.png", width = 3000, height = 3000, res = 300)
print(p)
dev.off()
## png
## 2
Export metadata and expression data for cluster 1 vs background comparison in seronegative patients; RA4, RA5 and RA6
# Export data for sections 4 to 6, cluster 1 vs all
mdata.RA4to6_cluster_1_vs_all <- setNames(metadata[patient %in% c("RA4", "RA5", "RA6"), c("cell", "aggr_cluster")], nm = c("cell", "cluster"))
write.table(x = mdata.RA4to6_cluster_1_vs_all, file = "../../cellphonedb/input/meta_RA4to6_cluster_1_vs_all.txt", quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
fwrite(x = exprMat[, c(1, which(patient %in% c("RA4", "RA5", "RA6")) + 1)], file = "../../cellphonedb/input/counts_RA4to6_cluster_1_vs_all.txt", sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
mdata.RA4to6_cluster_1_vs_all <- cbind(mdata.RA4to6_cluster_1_vs_all, res.merged[mdata.RA4to6_cluster_1_vs_all$cell, c("Image_ID", "x", "y")])
# Plot selections for sanity check
p <- ggplot(mdata.RA4to6_cluster_1_vs_all, aes(x, y, color = factor(cluster))) +
geom_point(size = 0.5) +
facet_wrap(~Image_ID) +
theme_void() +
labs(color = "cluster")
p
png(filename = "../all_clusters/cluster_1_vs_all_RA4to6_overview.png", width = 3000, height = 3000, res = 300)
print(p)
dev.off()
## png
## 2
Export metadata and expression data for cluster 1 (seropositive) vs cluster 1 (seronegative) comparison. All patient datasets included.
# Export data for cluster 1 seropositive vs cluster 1 seronegative
mdata.seropositive_vs_seronegative <- setNames(metadata[metadata$cluster_1 %in% c("cluster_1_seropositive", "cluster_1_seronegative"), c("cell", "cluster_1")], nm = c("cell", "cluster"))
write.table(x = mdata.seropositive_vs_seronegative,
file = "../../cellphonedb/input/meta_seropositive_vs_seronegative.txt",
quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
fwrite(x = exprMat[, c(1, which(metadata$cluster_1 %in% c("cluster_1_seropositive", "cluster_1_seronegative")) + 1)],
file = "../../cellphonedb/input/counts_seropositive_vs_seronegative.txt",
sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
mdata.seropositive_vs_seronegative <- cbind(mdata.seropositive_vs_seronegative, res.merged[mdata.seropositive_vs_seronegative$cell, c("Image_ID", "x", "y")])
# Plot selections for sanity check
p <- ggplot(mdata.seropositive_vs_seronegative, aes(x, y, color = factor(cluster))) +
geom_point(size = 0.5) +
facet_wrap(~Image_ID) +
theme_void() +
labs(color = "cluster")
p
png(filename = "../all_clusters/cluster_1_seropositive_vs_seronegative.png", width = 3000, height = 3000, res = 300)
print(p)
dev.off()
## png
## 2
date()
## [1] "Wed Sep 15 15:48:47 2021"
sessionInfo()
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: CentOS Linux 8
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.3.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] data.table_1.14.0 ggplot2_3.3.3
##
## loaded via a namespace (and not attached):
## [1] bslib_0.2.4 compiler_4.0.4 pillar_1.5.0 jquerylib_0.1.3
## [5] tools_4.0.4 digest_0.6.27 jsonlite_1.7.2 evaluate_0.14
## [9] lifecycle_1.0.0 tibble_3.0.6 gtable_0.3.0 pkgconfig_2.0.3
## [13] rlang_0.4.10 DBI_1.1.1 crosstalk_1.1.1 yaml_2.2.1
## [17] xfun_0.20 withr_2.4.1 stringr_1.4.0 dplyr_1.0.4
## [21] knitr_1.30 generics_0.1.0 htmlwidgets_1.5.3 sass_0.3.1
## [25] vctrs_0.3.6 grid_4.0.4 DT_0.17 tidyselect_1.1.0
## [29] glue_1.4.2 R6_2.5.0 fansi_0.4.2 rmarkdown_2.7
## [33] farver_2.0.3 purrr_0.3.4 magrittr_2.0.1 scales_1.1.1
## [37] htmltools_0.5.1.1 ellipsis_0.3.1 assertthat_0.2.1 colorspace_2.0-0
## [41] labeling_0.4.2 utf8_1.1.4 stringi_1.5.3 munsell_0.5.0
## [45] crayon_1.4.1